Hello world

Libraries that are used

#install.packages('dplyr')
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#install.packages('lubridate')
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
#install.packages('tidyr')
library(tidyr)
#install.packages('ggplot2')
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.5     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x dplyr::filter()          masks stats::filter()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()
#install.packages('gganimate')
library(gganimate)
#install.packages('gifski')
library(gifski)
#install.packages('av')
library(av)
#install.packages('ggmap')
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
#install.packages('shiny')
library(shiny)
#install.packages('leaflet')
library(leaflet)

Reading the data

df_trees_orig <- read.csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-28/sf_trees.csv')

We create a copy of data to preserve its original form in case we want to look at it for reference

df_trees <- df_trees_orig

Data Cleaning

We look at the data structure, and see all the string variables are saved as char. We would rather have them as factors for potential modelling use.

str(df_trees)
## 'data.frame':    192987 obs. of  12 variables:
##  $ tree_id     : int  53719 30313 30312 30314 30315 30316 48435 30319 30318 30320 ...
##  $ legal_status: chr  "Permitted Site" "Permitted Site" "Permitted Site" "DPW Maintained" ...
##  $ species     : chr  "Tree(s) ::" "Tree(s) ::" "Tree(s) ::" "Pittosporum undulatum :: Victorian Box" ...
##  $ address     : chr  "2963 Webster St" "501 Arkansas St" "501 Arkansas St" "501 Arkansas St" ...
##  $ site_order  : int  1 3 2 1 5 6 4 2 1 3 ...
##  $ site_info   : chr  "Sidewalk: Curb side : Cutout" "Sidewalk: Curb side : Cutout" "Sidewalk: Curb side : Cutout" "Sidewalk: Curb side : Cutout" ...
##  $ caretaker   : chr  "Private" "Private" "Private" "Private" ...
##  $ date        : chr  "1955-09-19" "1955-10-20" "1955-10-20" "1955-10-20" ...
##  $ dbh         : int  NA NA NA 16 NA NA NA NA NA NA ...
##  $ plot_size   : chr  NA NA NA NA ...
##  $ latitude    : num  37.8 37.8 37.8 37.8 37.8 ...
##  $ longitude   : num  -122 -122 -122 -122 -122 ...

Changing columns with type character to type factor. you need double brackets because single bracket indexing returns a df object, which always evaluates to FALSE. Double brackets returns you the column as a vector whose data type can be tested. Alternatively, we can use single brackets but explicitly calling all rows and columns i.e [,i]

for (i in 1:12){ 
    if (is.character(df_trees[,i]) == TRUE) {
    df_trees[,i]<-as.factor(df_trees[,i])
  }
  }

date column has char data type. We change it to date.

# df_trees$date <- as.Date(df_trees$date,format="%Y-%m-%d") #using base r

df_trees$date <- lubridate::ymd(df_trees$date) #using lubridate

save the planted year of the tree as a separate column

# df_trees$year <- format(df_trees$date, "%Y")  #using base r

df_trees$year <- year(df_trees$date) # using lubridate
class(df_trees$year)
## [1] "numeric"

take a look at the format of species name in species column.

head(unique(df_trees$species))
## [1] Tree(s) ::                               
## [2] Pittosporum undulatum :: Victorian Box   
## [3] Acacia melanoxylon :: Blackwood Acacia   
## [4] Magnolia grandiflora :: Southern Magnolia
## [5] Corymbia ficifolia :: Red Flowering Gum  
## [6] Ginkgo biloba :: Maidenhair Tree         
## 571 Levels: :: ... Ziziphus jujuba :: Jujube

we see that for each species, the species column reports both the latin and common names. For future analysis, it might be more convenient to split the latin and common names into respective columns.

Separate the species column into latin and common name columns.

df_trees <- df_trees %>% 
  separate(species, sep = "::", remove = FALSE, into = c('species_lat', 'species_nor'))

Check the percentage of missing values in each column

for (col in colnames(df_trees)){
  print(paste(col, "=", mean(is.na(df_trees[[col]])))) #mean takes percentage here because adds up all TRUES (since their value is 1)
                                                       #and divides by total number of values (TRUE + FALSE)
}
## [1] "tree_id = 0"
## [1] "legal_status = 0.000279811593527025"
## [1] "species = 0"
## [1] "species_lat = 0"
## [1] "species_nor = 0"
## [1] "address = 0.00770518221434604"
## [1] "site_order = 0.00846689155228072"
## [1] "site_info = 0"
## [1] "caretaker = 0"
## [1] "date = 0.645691160544493"
## [1] "dbh = 0.216693352401975"
## [1] "plot_size = 0.259152170871613"
## [1] "latitude = 0.0146745635716395"
## [1] "longitude = 0.0146745635716395"
## [1] "year = 0.645691160544493"

Column with most significant amount of missing data is the date column.

We see almost every species has missing values (517 out of 571 species) so there is at least no clear indication that the data is missing not at random.

df_trees %>% filter(is.na(date))%>%
  select(species)%>%
  unique() %>% dim()
## [1] 517   1

further analysis of missing date values: percentage of observations with missing date values, per species.

df_trees_datena <- df_trees %>% filter(is.na(date)) %>% 
 count(species)
 
df_trees_species_count <- df_trees %>% count(species)

df_trees_merge <- dplyr::left_join(df_trees_species_count, df_trees_datena, by="species", suffix = c("total", "date.na"))
df_trees_merge$percent.missing <- df_trees_merge$ndate.na / df_trees_merge$ntotal
df_trees_merge %>%
  arrange(desc(ntotal), desc(percent.missing)) %>%
  head()
##                                          species ntotal ndate.na
## 1                                     Tree(s) ::  11629     1563
## 2 Platanus x hispanica :: Sycamore: London Plane  11557     9773
## 3  Metrosideros excelsa :: New Zealand Xmas Tree   8744     6470
## 4          Lophostemon confertus :: Brisbane Box   8581     4481
## 5          Tristaniopsis laurina :: Swamp Myrtle   7197     3682
## 6         Pittosporum undulatum :: Victorian Box   7122     5080
##   percent.missing
## 1       0.1344054
## 2       0.8456347
## 3       0.7399360
## 4       0.5222002
## 5       0.5116021
## 6       0.7132828

We look at the total number of observations we would lose if we drop species where missing 100% of date information. And the number is very small relative to total number of observations (n = 192987)

df_trees_merge %>% filter(percent.missing==1) %>% select(ntotal) %>%
  sum()
## [1] 731

How the percentage of missing date values per species is distributed. All species that have 100% of date data missing have a very low number of observations (n <=100).

df_trees_merge %>% 
  ggplot(aes(percent.missing)) + geom_histogram(fill='skyblue4') + 
  labs(title='Percentage of Missing Date Values per Species') +
  xlab('Percentage of missing date') + ylab('Count') + 
  scale_x_continuous(breaks=seq(0,1, .10)) +theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Therefore, we now filter the cases based on this 100 ntotal threshold to get a cleaner histogram.

df_trees_merge %>% filter(ntotal>100) %>%
  ggplot(aes(percent.missing))+geom_histogram(fill='skyblue4') + 
  labs(title='Percentage of Missing Date Values per Species') +
  xlab('Percentage of missing date') + ylab('Count') + 
  scale_x_continuous(breaks=seq(0,1, .10)) + theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exploratory Analysis

Checking year of plantation range. most of the trees (50%) were planted between 2001 and 2020. However, it is likely that many of the older trees simply were not recorded so they either dont appear in our data or have a missing date value.

df_trees %>%
  select(year)%>%
  quantile(na.rm = TRUE)
##   0%  25%  50%  75% 100% 
## 1955 1995 2001 2008 2020

Create a new column that shows age using the year column to calculate it. Use the more modern approach mutate(). Then use age column to get an overview of the distribution of tree ages via a histogram.

df_trees <- df_trees %>% 
  mutate(age = year(Sys.Date())-year)


hist(df_trees$age, main = 'Tree Age Distribution', xlab = 'Age', col = 'royalblue2') # using base r

Count the species

df_trees %>%
  count(species_lat) %>%
  arrange(desc(n)) %>%
  top_n(n = 20)
## Selecting by n
##                             species_lat     n
## 1                              Tree(s)  11629
## 2                 Platanus x hispanica  11557
## 3                 Metrosideros excelsa   8744
## 4                Lophostemon confertus   8581
## 5                Tristaniopsis laurina   7197
## 6                Pittosporum undulatum   7122
## 7                    Prunus cerasifera   6716
## 8                 Magnolia grandiflora   6285
## 9                     Arbutus 'Marina'   5702
## 10 Ficus microcarpa nitida 'Green Gem'   5624
## 11          Prunus serrulata 'Kwanzan'   4025
## 12                  Acacia melanoxylon   3956
## 13                     Maytenus boaria   3899
## 14                       Olea europaea   3694
## 15                  Corymbia ficifolia   3575
## 16                Callistemon citrinus   3266
## 17                       Ginkgo biloba   3212
## 18                    Pyrus calleryana   2969
## 19                    Prunus serrulata   2696
## 20                  Eriobotrya deflexa   2402

Show number of distinct species in a street

df_trees %>%
  group_by(address) %>%
  summarize(n = n_distinct(species_lat)) %>%
  arrange(desc(n)) %>%
  head()
## # A tibble: 6 x 2
##   address               n
##   <fct>             <int>
## 1 <NA>                 43
## 2 900x Carolina St     31
## 3 1201 San Jose Ave    30
## 4 1301 Minnesota St    28
## 5 310 Liberty St       26
## 6 1000 San Jose Ave    24

Analysis 1: Evolution of planted species over time

To do this specific analysis we drop the trees, which have missing date data for obvious reasons. Based on our missing date data analysis in the data cleaning section, we are confident the data does not display any obvious signs of MNAR.

Selected related columns

df_trees_hasdate <- df_trees %>% 
  filter(!is.na(date)) %>% 
  select(c("tree_id", "species_lat", "year"))

str(df_trees_hasdate)
## 'data.frame':    68377 obs. of  3 variables:
##  $ tree_id    : int  53719 30313 30312 30314 30315 30316 48435 30319 30318 30320 ...
##  $ species_lat: chr  "Tree(s) " "Tree(s) " "Tree(s) " "Pittosporum undulatum " ...
##  $ year       : num  1955 1955 1955 1955 1955 ...

Species_lat is saved as a factor.

df_trees_hasdate$species_lat <- as.factor(df_trees_hasdate$species_lat)

To make the number of species more manageable, we group them by family names (i.e Acacia, Magnolia)

unique(df_trees_hasdate[grep(df_trees_hasdate$species_lat, pattern = "^Acacia", ignore.case = TRUE), 'species_lat'])
## [1] Acacia melanoxylon           Acacia longifolia           
## [3] Acacia stenophylla           Acacia baileyana            
## [5] Acacia baileyana 'Purpurea'  Acacia cognata              
## [7] Acacia spp                   Acacia decurrens            
## 427 Levels:  Acacia baileyana  Acacia baileyana 'Purpurea'  ... Zelkova serrata 'Village Green'
unique(df_trees_hasdate[grep(df_trees_hasdate$species_lat, pattern = "^Magnolia", ignore.case = TRUE), 'species_lat'])
##  [1] Magnolia grandiflora                   
##  [2] Magnolia spp                           
##  [3] Magnolia grandiflora 'Little Gem'      
##  [4] Magnolia champaca                      
##  [5] Magnolia doltsopa                      
##  [6] Magnolia grandiflora 'Samuel Sommer'   
##  [7] Magnolia grandiflora 'Saint Mary'      
##  [8] Magnolia grandiflora 'Russet'          
##  [9] Magnolia x soulangiana                 
## [10] Magnolia x foggii 'Jack Fogg'          
## [11] Magnolia grandiflora 'Majestic Beauty' 
## [12] Magnolia grandiflora 'D.D. Blanchard'  
## [13] Magnolia sargentiana 'Robusta'         
## [14] Magnolia x alba                        
## [15] Magnolia doltsopa 'Silvercloud'        
## 427 Levels:  Acacia baileyana  Acacia baileyana 'Purpurea'  ... Zelkova serrata 'Village Green'

It seems that specific species can be grouped together with a general name. Since the general name is the first word of the species name, we will separate the species name into its words and then use the first word as the species general name.

Separate the species_lat column into its parts

df_trees_hasdate <- df_trees_hasdate %>% 
  separate(species_lat, sep = " ", remove = FALSE, into = c('species_first', 'species_second', 'species_third'))

The code below shows that the number of unique species after recategorizing species_lat is 158.

length(unique(df_trees_hasdate$species_first))
## [1] 158

Create a count column to save number of planted trees by species, per year.

df_trees_hasdate_tidy <- df_trees_hasdate %>%
  group_by(year) %>%
  count(species_first) %>%
  arrange(year, desc(n)) 

As a bit of side note we look at the total number of planted trees per year regardless of species to get a sense of overall tree planting

df_trees_hasdate_tidy %>%
  select(c('year', 'n')) %>%
  aggregate(by = list(df_trees_hasdate_tidy$year), FUN='sum') %>%
  ggplot(aes(x = Group.1, y = n)) + 
  geom_line() + 
  labs(title = 'Total Number of Planted Trees per Year', 
       x = 'Year', y = 'Total number of planted trees') +
  theme_minimal()

Animated Graph

In this step, We’re going to filter our data set to retain only the top 10 planted species for every given year to keep the visuals understandable. Also we drop the species labelled Tree(s) as this obviously tells us nothing about the species.

df_trees_hasdate_formated <- df_trees_hasdate_tidy %>% subset(species_first != 'Tree(s)') %>% 
  group_by(year) %>%
  mutate(rank = rank(-n)) %>%
  filter(rank <=10) 

In this step, we will create individual static graphs for each year.

staticplot = ggplot(df_trees_hasdate_formated, aes(rank, group = species_first, 
                fill = as.factor(species_first), color = as.factor(species_first))) +
  geom_tile(aes(y = n/2,
                height = n,
                width = 0.9), alpha = 0.8, color = NA) +
  geom_text(aes(y = 0, label = paste(species_first, " ")), vjust = 0.2, hjust = 1) +
  geom_text(aes(y=n, label = n, hjust=0)) +
  coord_flip(clip = "off", expand = FALSE) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_reverse() +
  guides(color = FALSE, fill = FALSE) +
  theme(axis.line=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        axis.title.x=element_blank(),
         axis.title.y=element_blank(),
        legend.position="none",
        panel.background=element_blank(),
        panel.border=element_blank(),
        panel.grid.major=element_blank(),
        panel.grid.minor=element_blank(),
        panel.grid.major.x = element_line( size=.1, color="grey" ),
        panel.grid.minor.x = element_line( size=.1, color="grey" ),
        plot.title=element_text(size=25, hjust=0.5, face="bold", colour="grey", vjust=-1),
        plot.subtitle=element_text(size=18, hjust=0.5, face="italic", color="grey"),
        plot.caption =element_text(size=8, hjust=0.5, face="italic", color="grey"),
        plot.background=element_blank(),
       plot.margin = margin(2,2, 2, 4, "cm"))
anim = staticplot + transition_states(year, transition_length = 4, state_length = 1) +
  view_follow(fixed_x = TRUE)  +
  labs(title = 'Total Number of Planted Trees per Year: {closest_state}',  
       subtitle  =  "Top 10 Species")

For gif

animate(anim, 200, fps = 2,  width = 1200, height = 1000, 
        renderer = gifski_renderer("gganim.gif"))

For Video

# For MP4
for_mp4 <- animate(anim, 200, fps = 5,  width = 1200, height = 1000, renderer = av_renderer()) 
anim_save("animation.mp4", animation = for_mp4 )

Alternative: cumulative total of top 5 species over time

find out which are the top 5 most planted species over entire time period. ‘Trees’ category will be omitted as it tells us nothing about species

df_agg <- aggregate(df_trees_hasdate_tidy$n, by=list(df_trees_hasdate_tidy$species_first), FUN='sum')
df_agg %>%
  arrange(desc(x)) %>%
  top_n(n=6)
## Selecting by x
##         Group.1     x
## 1       Tree(s) 10066
## 2        Prunus  6393
## 3 Tristaniopsis  4652
## 4   Lophostemon  4100
## 5      Magnolia  3773
## 6       Arbutus  3475

create df to be used for cumulative summation

df_top5 <- df_trees_hasdate_tidy %>%
  filter(species_first == 'Prunus' | species_first == 'Tristaniopsis' | species_first == 'Lophostemon' | species_first == 'Magnolia' | species_first =='Arbutus')

Calculate the cumulative summation for each species.

df_top5_cumsum <- df_top5 %>% 
  group_by(species_first) %>% 
  mutate(csum = cumsum(n))
#install.packages('transformr')
library('transformr')

Save the data type of year column as Date

df_top5_cumsum$year <- lubridate::ymd(df_top5_cumsum$year, truncated = 2L)

Create the static plot

species_cumsum <- ggplot(df_top5_cumsum, aes(x=year,y=csum, group=species_first, col=factor(species_first))) + geom_line() + scale_color_manual(values = c('#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00')) + theme_minimal() 

Create the animated plot with transition

anim2 <- species_cumsum + transition_reveal(year) + view_follow(fixed_x = TRUE)  +
  labs(title = 'Cumulative Total of Top 5 Species over Time', x = 'Year', y = 'Cumulative Sum')

Animated graph (gif version):

animate(anim2, 200, fps = 3,  width = 1200, height = 1000, 
        renderer = gifski_renderer("gganim2.gif"))

Analysis 2:

map of recommended species per (https://sfenvironment.org/sites/default/files/fliers/files/sf_tree_guide.pdf) :

recommended_sp <- c(' Japanese Blueberry Tree', ' Flaxleaf Paperbark', ' Red Flowering Gum', ' Flowering Cherry', 
                    ' Little Gem Magnolia', ' Southern Magnolia', ' Weeping Bottlebrush', ' Hybrid Strawberry Tree', 
                    ' Primrose Tree', ' Brisbane Box', ' Mediterranean Fan Palm', ' Fruitless Olive', ' Chilean Soapbark', 
                    " Small-leaf Tristania 'Elegant'", ' Chinese Pistache', ' Trident Maple', ' Chinese Elm', ' Cork Oak', 
                    ' Ginkgo: Autumn Gold', ' Fairmont Ginkgo', ' Ginkgo: Saratoga', ' Autumn Sentinel Ginkgo'
                    )
unique(df_trees[grep(df_trees$species_nor, pattern = "Ginkgo", ignore.case = TRUE), 'species'])
## [1] Ginkgo biloba 'Autumn Gold' :: Ginkgo: Autumn Gold       
## [2] Ginkgo biloba 'Fairmont' :: Fairmont Ginkgo              
## [3] Ginkgo biloba 'Saratoga' :: Ginkgo: Saratoga             
## [4] Ginkgo biloba 'Autumn Sentinel' :: Autumn Sentinel Ginkgo
## 571 Levels: :: ... Ziziphus jujuba :: Jujube
df_trees$species_nor <- as.factor(df_trees$species_nor)
df_trees_filtered <- df_trees %>% 
  filter(species_nor %in% recommended_sp)
unique(df_trees_filtered$species_nor)
##  [1]  Southern Magnolia               Red Flowering Gum             
##  [3]  Hybrid Strawberry Tree          Chinese Elm                   
##  [5]  Brisbane Box                    Weeping Bottlebrush           
##  [7]  Little Gem Magnolia             Small-leaf Tristania 'Elegant'
##  [9]  Chinese Pistache                Fruitless Olive               
## [11]  Japanese Blueberry Tree         Trident Maple                 
## [13]  Flaxleaf Paperbark              Primrose Tree                 
## [15]  Ginkgo: Autumn Gold             Chilean Soapbark              
## [17]  Cork Oak                        Mediterranean Fan Palm        
## [19]  Fairmont Ginkgo                 Ginkgo: Saratoga              
## [21]  Autumn Sentinel Ginkgo          Flowering Cherry              
## 551 Levels:   'Akebono' Cherry  Acacia Spp ...  Yucca

Get the coordinates of the whole dataset

df_trees_filtered %>% 
  summarise(max_lon = max(longitude,  na.rm=TRUE), min_lon = min(longitude,  na.rm=TRUE),
            max_lat = max(latitude,  na.rm=TRUE), min_lat = min(latitude,  na.rm=TRUE))
##     max_lon   min_lon  max_lat  min_lat
## 1 -122.3692 -138.2839 47.27022 37.70793
register_google('AIzaSyB2pTXikWRKM3XgN4WC-p-Fu2mOs8fZK88')
san_francisco = c(left=-122.7, bottom = 37.7, right = -122.3, top = 37.85)
smap <- get_map(san_francisco, maptype = 'hybrid') %>% #get_stamenmap(san_francisco, maptype = 'watercolor')
  ggmap()
## Source : http://tile.stamen.com/terrain/12/651/1582.png
## Source : http://tile.stamen.com/terrain/12/652/1582.png
## Source : http://tile.stamen.com/terrain/12/653/1582.png
## Source : http://tile.stamen.com/terrain/12/654/1582.png
## Source : http://tile.stamen.com/terrain/12/655/1582.png
## Source : http://tile.stamen.com/terrain/12/656/1582.png
## Source : http://tile.stamen.com/terrain/12/651/1583.png
## Source : http://tile.stamen.com/terrain/12/652/1583.png
## Source : http://tile.stamen.com/terrain/12/653/1583.png
## Source : http://tile.stamen.com/terrain/12/654/1583.png
## Source : http://tile.stamen.com/terrain/12/655/1583.png
## Source : http://tile.stamen.com/terrain/12/656/1583.png
## Source : http://tile.stamen.com/terrain/12/651/1584.png
## Source : http://tile.stamen.com/terrain/12/652/1584.png
## Source : http://tile.stamen.com/terrain/12/653/1584.png
## Source : http://tile.stamen.com/terrain/12/654/1584.png
## Source : http://tile.stamen.com/terrain/12/655/1584.png
## Source : http://tile.stamen.com/terrain/12/656/1584.png
smap + geom_point(data = df_trees_filtered, aes(x=longitude, y = latitude, col = species_nor)) #+ scale_color_manual(values = c('#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00'))

Create map with leaflet() package

## for all species

## will be added a color for each species. 

#map center
mlong = -122.4446
mlat  = 37.72

pal <- colorFactor('viridis', domain = recommended_sp)

leaflet(data = df_trees_filtered, 
                width = '100%', 
                height = 800,
                options = leafletOptions(minZoom = 9, maxZoom = 18)) %>% 
        addTiles() %>%
        setView(lng = mlong, lat = mlat, zoom = 12) %>%
        addCircleMarkers(lng = df_trees_filtered$longitude, 
                         lat = df_trees_filtered$latitude,
                         color = ~pal(df_trees_filtered$species_nor),
                         popup = df_trees_filtered$species_nor,
                         label = df_trees_filtered$species_nor, 
                         radius = 4) %>%
      addLayersControl(overlayGroups = df_trees_filtered$species_nor) ## add layers control

but map is too chaotic give large number of species…sooo Shiny to the rescue:

Create a shiny app version of the map above, with a drop down menu to look at each species at a time.

#map center
mlong = -122.4446
mlat  = 37.72

pal <- colorFactor('Paired', domain = recommended_sp, reverse = TRUE)

shinyApp(
  
  # Define the UI
  ui = fluidPage(
      
    # App title
    titlePanel("Recommended Tree Species of San Francisco"),
               
    #tabPanel(a(href='https://sfenvironment.org/sites/default/files/fliers/files/sf_tree_guide.pdf', 'Source')),
      
    # Sidebar layout with input and output definitions
    sidebarLayout(
        
      # Sidebar panel for inputs 
      sidebarPanel(
          
        # First input: Type of data
        
        checkboxGroupInput(inputId = "species", #name of the input, widget
                      label = "Choose a species:",
                      choices = c('All Species',  ' Japanese Blueberry Tree', ' Flaxleaf Paperbark', ' Red Flowering Gum', 
                                  ' Flowering Cherry', ' Little Gem Magnolia', ' Southern Magnolia', ' Weeping Bottlebrush', 
                                  ' Hybrid Strawberry Tree', ' Primrose Tree', ' Brisbane Box', ' Mediterranean Fan Palm', 
                                  ' Fruitless Olive', ' Chilean Soapbark', " Small-leaf Tristania 'Elegant'", ' Chinese Pistache', 
                                  ' Trident Maple', ' Chinese Elm', ' Cork Oak', ' Ginkgo: Autumn Gold', ' Fairmont Ginkgo', 
                                  ' Ginkgo: Saratoga', ' Autumn Sentinel Ginkgo'
                    ), 
                      selected = ' Japanese Blueberry Tree'
        )),
        
      # Main panel for displaying outputs
      mainPanel(
          
        # Hide errors
        tags$style(type = "text/css",
                     ".shiny-output-error { visibility: hidden; }",
                     ".shiny-output-error:before { visibility: hidden; }"),
          
        # Output: interactive world map
        leafletOutput("map", height = 800)
          
        )
      )
    ),
    
  # Define the server
  server = function(input, output) {
    
    filteredData <- reactive({
      if (input$species == "All Species") {
        df_trees_filtered
      } else {
        filter(df_trees_filtered, species_nor == input$species)
      }
    })
      
    output$map <- renderLeaflet({
          
        leaflet(filteredData(), 
                width = '100%', 
                options = leafletOptions(minZoom = 9, maxZoom = 18)) %>% 
        addTiles() %>%
        setView(lng = mlong, lat = mlat, zoom = 12) %>%
        addCircleMarkers(lng = filteredData()$longitude, 
                           lat = filteredData()$latitude,
                           popup = filteredData()$species_nor,
                           color = ~pal(filteredData()$species_nor),
                           label = ~species_nor, 
                           radius = 4, 
                           fillOpacity = 0.99)
    })
  }
)

Shiny applications not supported in static R Markdown documents

#Analysis 3: - another map with rare (n=1) species

create list of rare species with name matching original df_trees

species_rare <-  df_trees_species_count%>%
  filter(n == 1) %>%
  separate(species, sep = "::", remove = FALSE, into = c('species_lat', 'species_nor'))

use list of rare species to subset original df_trees and also group them by common first name (i.e family name) to reduce species number

df_trees_rare <- df_trees %>%
  filter(species_nor %in% species_rare$species_nor) %>%
  separate(species_lat, sep = " ", remove = FALSE, into = c('species_first', 'species_second', 'species_third'))

plot map with ggmap

smap + geom_point(data = df_trees_rare, aes(x=longitude, y = latitude, col = species_first))

plot map using leaflet, to create a hover-over dot pop up with species name, given not enough colors to differentiate so many species

#map center
mlong = -122.4446
mlat  = 37.72


rare_species_vec <- unique(df_trees_rare$species_lat)
pal <- colorFactor('Paired', domain = rare_species_vec)

leaflet(data = df_trees_rare, 
                width = '100%', height = 800,
                options = leafletOptions(minZoom = 9, maxZoom = 18)) %>% 
        addTiles() %>%
        setView(lng = mlong, lat = mlat, zoom = 12) %>%
        addCircleMarkers(lng = df_trees_rare$longitude, 
                         lat = df_trees_rare$latitude,
                         popup = df_trees_rare$species_lat,
                         label = df_trees_rare$species_lat, 
                         radius = 4,
                         fillOpacity = .5)